#!/usr/bin/env python
#encoding: utf8
import os
import re
import sys
import socket
import struct
import logging
import jieba


def detect_encoding(text):
    """Guess the encoding of ``text`` and return it together with the decoded text.

    Candidate codecs are tried in a fixed order; the first one that decodes
    ``text`` without error is reported.

    Args:
        text: A byte string to decode, or an already-decoded unicode string.

    Returns:
        A ``(codec_name, decoded_text)`` tuple. Unicode input is returned
        unchanged and labelled ``'utf8'``. If no candidate decodes cleanly,
        the text is decoded with the first candidate and undecodable bytes
        are dropped.
    """
    # Order matters: the first codec that succeeds wins. Spelling/case aliases
    # of one codec are listed only once, because an alias can only succeed
    # where the first spelling already did.
    # NOTE(review): the GB codecs are tried before UTF-8, so UTF-8 input whose
    # bytes happen to form valid GB pairs is reported as GB; order kept as-is
    # so existing results do not change.
    targets = ['gb2312', 'gbk', 'utf8', 'iso-8859-2', 'ISO-8859-1', 'ascii']
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: ``str`` is the unicode text type
    if isinstance(text, text_type):
        return 'utf8', text
    for code in targets:
        try:
            return code, text.decode(code)
        except UnicodeError:
            # Not valid in this codec; fall through to the next candidate.
            continue
    return targets[0], text.decode(targets[0], 'ignore')

def format_html(html):
    """Decode ``html`` and squeeze every run of whitespace into a single space.

    The input is decoded via ``detect_encoding``; the detected codec name is
    not used. The result is the whitespace-separated tokens of the decoded
    text joined with single spaces.
    """
    decoded = detect_encoding(html)[1]
    return u' '.join(decoded.split())

def walkdir(rootdir):
    """Lazily yield the path of every file found beneath ``rootdir``.

    Each yielded path is the directory reported by ``os.walk`` joined with
    the file name, so it carries the same prefix as ``rootdir``.
    """
    for dirpath, _subdirs, filenames in os.walk(rootdir):
        for filename in filenames:
            yield os.path.join(dirpath, filename)

def get_ip_address(ifname=None):
    """Return an IPv4 address of this host as a dotted-quad string.

    On Windows the address is whatever the host name resolves to and
    ``ifname`` is ignored. Elsewhere the address bound to the network
    interface ``ifname`` is queried with the SIOCGIFADDR ioctl.

    Args:
        ifname: Interface name (e.g. ``'eth0'``). When ``None``, the value of
            ``conf.network_interface`` is used.
            NOTE(review): ``conf`` is not defined or imported in the visible
            part of this file -- confirm it is provided elsewhere.

    Returns:
        The IPv4 address as a string.
    """
    if sys.platform == 'win32':
        return socket.gethostbyname(socket.gethostname())

    # Only the ioctl path needs an interface name, so resolve the default here.
    if ifname is None:
        ifname = conf.network_interface
    # struct's 's' format needs a byte string (matters for text names).
    if not isinstance(ifname, bytes):
        ifname = ifname.encode('utf-8')

    import fcntl
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # The request buffer starts with the interface name, truncated to 15
        # bytes and zero-padded by the '256s' format.
        response = fcntl.ioctl(
            s.fileno(),
            0x8915,  # SIOCGIFADDR
            struct.pack('256s', ifname[:15])
        )
    finally:
        # Always release the descriptor, even when the ioctl fails.
        s.close()
    # Bytes 20..23 of the response are passed to inet_ntoa as the packed
    # IPv4 address.
    return socket.inet_ntoa(response[20:24])

def clean_html(html):
    """Strip markup from ``html`` and return the remaining text.

    Steps, in order: remove ``<script>``/``<style>`` elements with their
    content, remove HTML comments, replace every remaining tag with a space,
    turn ``&nbsp;`` entities into spaces, collapse runs of spaces into a
    single space, and trim leading/trailing whitespace.

    Args:
        html: The HTML source as a string.

    Returns:
        The text content with tags removed. Newlines and tabs inside the
        text are left untouched; only runs of the space character collapse.
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    # Collapse a run of spaces of any length in one pass; replacing pairs a
    # fixed number of times leaves long runs only partially squeezed.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()

def segment(text):
    """Split ``text`` into words with jieba and join them with single spaces.

    Args:
        text: The text to segment. Unicode input is encoded to UTF-8 before
            being stripped and handed to ``jieba.cut``.

    Returns:
        The non-empty tokens produced by ``jieba.cut``, joined by a space.
    """
    if isinstance(text, type(u'')):
        text = text.encode("utf8")
    text = text.strip()
    # Drop empty tokens while building the list; a bare filter() call whose
    # result is discarded would leave the list unchanged.
    words = [word for word in jieba.cut(text) if word != '']
    return ' '.join(words)

if __name__ == '__main__':
    # Smoke test: run the segmenter on an embedded sample page, first on the
    # raw markup and then on the tag-stripped text. The sample is inlined so
    # the script does not depend on any file being present on the machine.
    html = """
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML dir=rtl><HEAD><TITLE>By:越南邻国宰相</TITLE>
<META content="text/html; charset=gb2312" http-equiv=Content-Type>
<META content=en-us http-equiv=Content-Language>
<META name=GENERATOR content="MSHTML 8.00.6001.19088"></HEAD>
<BODY bgColor=#000000>
<P dir=ltr align=center><IMG border=0
src="http://www.xinbaoleyy.com/zaixiang.png" width=800 height=400></P>
<P dir=ltr align=center><FONT color=#ffffff face=Tahoma><B>Hacked
By</B></FONT></P>
<P dir=ltr align=center><FONT color=#ffffff size=6
face=Tahoma><B>Vietnam's prime minister neighbors</B></FONT></P>
<P dir=ltr align=center><FONT color=#ffffff size=2
face=Tahoma>中国1937集团军网络司令部（西北黑客基地）</FONT></P>
<P align=center><B><FONT color=#ffffff
size=4>爱心创造和谐，理性铸就成长；创新改变命运，激情成就梦想</FONT></B></P>
<DIV align=center>
<CENTER>
<TABLE
style="BORDER-BOTTOM: 1px dotted; BORDER-LEFT: 1px dotted; BORDER-COLLAPSE: collapse; BORDER-TOP: 1px dotted; BORDER-RIGHT: 1px dotted"
border=2 cellSpacing=0 borderColor=#333333 borderColorLight=#666666
cellPadding=0 width="31%">
  <TBODY>
  <TR>
    <TD dir=ltr bgColor=#000000 borderColor=#ffffff width="100%" align=middle>
      <P style="MARGIN: 3px 9px" dir=ltr align=left><FONT color=#c0c0c0 size=2
      face=Tahoma>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
      西北黑客基地召回令</FONT></P>
      <P style="MARGIN: 3px 9px" dir=ltr align=left><FONT color=#ffffff size=2
      face=Tahoma>本着对计算机的热爱我们曾今相聚于</FONT><FONT color=#ff0000 size=2
      face=Tahoma>西北黑客基地</FONT><FONT color=#c0c0c0 size=2 face=Tahoma>
      </FONT></P>
      <P style="MARGIN: 3px 9px" dir=ltr align=left><FONT color=#ffffff size=2
      face=Tahoma>现1937集团军网络司令部诚召旧部</FONT><FONT color=#ff0000 size=2
      face=Tahoma>落叶归根，重新起航</FONT></P>
      <P style="MARGIN: 3px 9px" dir=ltr align=left><FONT color=#ffffff size=2
      face=Tahoma>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
      &nbsp;QQ群：5448320</FONT></P>
      <P style="MARGIN: 3px 9px" dir=ltr align=left><FONT color=#ffffff size=2
      face=Tahoma>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
      &nbsp;&nbsp;YY频道：400958</FONT></P></TD></TR></TBODY></TABLE></CENTER></DIV>
<P dir=ltr align=center><FONT color=#ffffff face=Tahoma><SPAN
lang=en-us><B>China</B>&nbsp;is <B>H</B>ack[&nbsp;<U><B> <A
href="http://www.1937cN.CoM">WwW.1937cN.CoM</A></B></U>&nbsp;]</SPAN></FONT></P>
<P dir=ltr align=center><FONT color=#ffffff><SPAN lang=en-us><B><FONT
size=4></FONT></B></SPAN></FONT></P>
<P dir=ltr align=center><FONT color=#ffffff>我们每天所面对的是什么？&nbsp;</FONT>
<P dir=ltr align=center><FONT color=#ffffff>( ......生活的压力&nbsp; </FONT><FONT
color=#ff0000>人性的冷漠<SPAN lang=en-us>&nbsp;</SPAN></FONT><FONT color=#ffffff>
外敌的侵略...... )</FONT>
<P dir=ltr align=center><SPAN lang=en-us><FONT
color=#ffffff>万里长城十亿兵&nbsp;国耻岂待儿孙平&nbsp;</FONT></SPAN>
<P dir=ltr align=center>
<OBJECT width=474 height=372><PARAM NAME="movie" VALUE="http://www.flash8.net/uploadflash/69/flash8net_68608.swf">
				              <embed src="http://www.flash8.net/uploadflash/69/flash8net_68608.swf"
type="application/x-shockwave-flash" width="425"
height="350"></embed></OBJECT></P>
<P dir=ltr align=center><B><FONT color=#999999 size=5 face="Arial Narrow"><A
href="http://www.1937cn.com"><FONT
color=#999999>中国1937集团军网络司令部官方网站</FONT></A></FONT></B></P>
<P dir=ltr align=center>&nbsp;</P>
<P dir=ltr align=center><B><FONT color=#ffffff size=6
face="Arial Narrow">上帝眷顾的是有权利的人</FONT></B></P>
<P dir=ltr align=center>&nbsp;</P>
<P dir=ltr align=center><B><FONT color=#214a6b size=5 face="Arial Narrow"><FONT
color=#ffffff>你.我来到世上却不曾被任何人眷顾</FONT></FONT></B></P>
<P dir=ltr align=center><B><FONT color=#214a6b size=5 face="Arial Narrow"><FONT
color=#ffffff>所以我们像小强一样的活着</FONT></FONT></B></P>
<P dir=ltr align=center><B><FONT color=#999999 size=5
face="Arial Narrow">------------------------------------------------------</FONT></B></P>
<P dir=ltr align=center>&nbsp;</P>
<P dir=ltr align=center><FONT color=#999999 size=5>{&nbsp;<B>生下来</B>
}</FONT></P>
<P dir=ltr align=center><FONT color=#999999
size=5>------------------------活下去------------------------</FONT></P>
<P style="MARGIN-BOTTOM: -10px; MARGIN-LEFT: 170px; MARGIN-RIGHT: 170px" dir=ltr
align=center> </P><FONT color=#999999>
<P class=STYLE19 align=center>Copoyright @ www.1937cn.com All Rights
Reserved.2012_2013 E-mail:281633753@.qq.com</P></FONT></BODY></HTML>
"""

    # Segment the raw markup as-is.
    w1 = segment(html)

    # Segment again after stripping tags, comments and entities.
    html = clean_html(html)
    w2 = segment(html)
    # Completion marker; the call form prints the same text on Python 2 and 3.
    print('null')
